#############################################################################
# FigureC5.R
# Aim: to replicate Figure C5 in Atsusaka and Stevenson (2021)
# Run time: about 30 seconds
#############################################################################

rm(list=ls())
start <- Sys.time()
library(tidyverse)
library(stats)

###############################################################################################    
# (1) WRITING A FUNCTION
###############################################################################################   

cm.predictor <- function(formula, p, p2, data){
  
  i.logit <- function(XB){ exp(XB)/(1 + exp(XB))}  
  df <- model.frame(formula, data, na.action = na.omit)
  X0 <- model.matrix.default(formula, df) # Data matrix including A
  X1 <- X0[, 1:(dim(X0)[2]-2)]            # Matrix with 1 and predictors

  A <- df[, dim(X0)[2]]
  Y <- df[,dim(X0)[2]-1]
  V <- df[,1]          # Outcome variable
  p <- p               # Randomization probability in Crosswise Q
  p2 <- p2             # Randomization probability in Anchor Q
  
  k <- dim(X1)[2]      # Number of beta parameters ( #covariate + 1)
  k.t <- k + 1         # Start of theta parameters
  k.t.e <- 2*k         # End of theta parameters
  k.g <- k.t.e + 1     # Start of gamma parameters ( #coveriate + 2)
  k.g.e <- k.g + k + 1 # End of gamma parameters
  k.g.e_part <- k.g.e - 2
  
# par[k.g.e] = sigma parameter  
# par[k.g.e-1]  = gamma.cm (coef on Z)

  init <- c(rep(0.01,k.g.e-1),1)
  
#---------------------------------------------------------------------------------------#  
# PARAMTERS TO ESTIMATE (11 parameters, if two covariates)
# beta0, beta1, beta2, 
#  theta0, theta1, theta2, 
#  gamma0, gamma1, gamma2, gamma.cm (Z), sigma
#---------------------------------------------------------------------------------------#    

# LOG-LIKELIHOOD FUNCTION
log.L.pred <- function(par) {
  sum(log( (1/(par[k.g.e]*sqrt(2*pi)) )*exp( -(V - (X1 %*% par[k.g:k.g.e_part] + 1*par[k.g.e-1] ))^2/(2*(par[k.g.e]^2)) )
           * (i.logit(X1 %*% par[1:k])) * (i.logit(X1 %*% par[k.t:k.t.e]))  * (p^Y)*((1-p)^(1-Y)) * ((1-p2)^A)*(p2^(1-A))    
           
        +  (1/(par[k.g.e]*sqrt(2*pi)) )*exp( -(V - (X1 %*% par[k.g:k.g.e_part] + 0*par[k.g.e-1] ))^2/(2*(par[k.g.e]^2)) )
           * (1-(i.logit(X1 %*% par[1:k]))) * (i.logit(X1 %*% par[k.t:k.t.e])) * ((1-p)^Y)*(p^(1-Y))*((1-p2)^A)*(p2^(1-A))
        
        +  (1/(par[k.g.e]*sqrt(2*pi)) )*exp( -(V - (X1 %*% par[k.g:k.g.e_part] + 1*par[k.g.e-1] ))^2/(2*(par[k.g.e]^2)) )
           * (i.logit(X1 %*% par[1:k])) * (1 - i.logit(X1 %*% par[k.t:k.t.e])) * (1/4)
        
        +  (1/(par[k.g.e]*sqrt(2*pi)) )*exp( -(V - (X1 %*% par[k.g:k.g.e_part] + 0*par[k.g.e-1] ))^2/(2*(par[k.g.e]^2)) ) 
           * (1-(i.logit(X1 %*% par[1:k]))) * (1 - i.logit(X1 %*% par[k.t:k.t.e])) * (1/4)
  ))
}


# MAXIMIZATION
MLE = optim(par=init,                      # initial values for beta and theta
            fn = log.L.pred,               # function to maximize
            method = "BFGS",               # this method lets set lower bounds (Modified Newton method)
            control = list(maxit=800, fnscale = -1),
            hessian = TRUE)                # calculate Hessian matricce because we will need for confidence intervals

H = MLE$hessian                            # Hessian matrix
Var.hat = diag(-solve(H))                  # Variance as the negative inverse of the Hessian matrix (Dropping covariances)
SE = sqrt(Var.hat)                         # Standard errors


Mlist <- list()
Mlist[[1]] <- MLE$par
Mlist[[2]] <- SE

return(Mlist)
# Next: make pretty summary
}


#############################################################################
# (2) DATA GENERATING PROCESS
#############################################################################

i.logit <- function(XB){ exp(XB)/(1 + exp(XB))}
Nvec = c(250, 500, 1000, 2000, 3000, 4000, 5000)
est.gamma0 = NA
est.gamma1 = NA
est.gamma2 = NA
est.delta = NA
est.sigma = NA
est.gamma0.sd = NA
est.gamma1.sd = NA
est.gamma2.sd = NA
est.delta.sd = NA
est.sigma.sd = NA

beta0  = NA
beta1  = NA
beta2  = NA
theta0 = NA
theta1 = NA
theta2 = NA
beta0.sd  = NA
beta1.sd  = NA
beta2.sd  = NA
theta0.sd = NA
theta1.sd = NA
theta2.sd = NA

for(i in 1:length(Nvec)){
set.seed(1234)
                                             
N = Nvec[i] # Sample size

beta  = c(-1.5, 0.5, 0.02)                           # Unknown parameter (associating covariates and sensitive trais)
theta = c(2, -0.1, -0.01)                            # Unknown parameter (associating covariates and attentiveness)
female = rbinom(n=N, size=1, prob=0.5)               # 50% Male, 50% Female
age = rpois(n=N, lambda=30)                          # Age with mean 30
x = as.matrix(data.frame(rep(1, N), female, age))

XB =  x %*% beta                                     # Linear aggregator with betas
XT =  x %*% theta                                    # Linear aggregator with thetas
pi.x    = i.logit(XB)                                # Pi|Male
gamma.x = i.logit(XT)                                # Gamma|Male

p = 0.1                                              # Randomization probability
p2 = 0.15                                            # Randomization probability for the anchor question
Attentive = rbinom(n=N, size=1, prob=gamma.x)        # Attentive R or not

# glm(pi.x~x, family=binomial)                       # CHECK
# glm(gamma.x~x, family=binomial)                    # CHECK


# SENSITIVE QUESTION OF INTERST
StatementA = rbinom(n=N, size=1, prob=pi.x)          # Sesitive item
StatementB = rbinom(n=N, size=1, prob=p)             # Randomization item
True.res = ifelse(StatementA != StatementB, 0, 1)    # Survey answer 

Obs.res = rep(NA, N)
Obs.res[Attentive==1] = True.res[Attentive==1]       # Observed answer for attentive Rs
Obs.res[Attentive==0] = rbinom(n=length(N[Attentive==0]),
                               size=1, prob=0.5)     # Observed answer, otherwise


# NON-SENSITIVE ANCHOR QUESTION
StatementC = 0                                       # Attention "c"heck item (True prevalence is 0 for everyone)
StatementD = rbinom(n=N, size=1, prob=p2)            # Anchor question
True.res.prime = ifelse(StatementC != StatementD, 0, 1) # Survey answer in Anchor Question

Obs.res.prime = rep(NA, N)
Obs.res.prime[Attentive==1] = True.res.prime[Attentive==1] # Observed answer for attentive Rs
Obs.res.prime[Attentive==0] = rbinom(n=length(N[Attentive==0]),
                                     size=1, prob=0.5)# Observed answer, otherwise

Y = Obs.res
A = Obs.res.prime


# PARAMETERS associating covariates + sensitive latent trait with the OUTCOME variable (NORMAL)
gamma0 = 0
gamma1 = 0.3
gamma2 = 0.01
gamma.cm = 1
SD = 1

Z = StatementA                                    # Having a sensitive attribute

V = gamma0*rep(1,N) + gamma1*female + gamma2*age + gamma.cm*Z + rnorm(mean=0, sd=SD, n=N)   # Outcome
gamma <- c(gamma0, gamma1, gamma2, gamma.cm)


dat <- cbind(V, Y, female, A, p, p2)                          # Observed data
dat <- as.data.frame(dat) # Data Frame
head(dat) 

#------------------------------------#
# GROUND TRUTH
truepar <- c(beta, theta, gamma, SD)
#------------------------------------#

# par(mfrow=c(1,2))
# hist(pi.x, breaks=20)
# hist(gamma.x, breaks=20)


#############################################################################
# (3) VALIDATING THE MODEL
#############################################################################

result <- cm.predictor(V ~ female+age+Y+A, p=0.1, p2=0.15, data=dat)


est.gamma0[i] = result[[1]][7]
est.gamma1[i] = result[[1]][8]
est.gamma2[i] = result[[1]][9]
est.delta[i]  = result[[1]][10] 
est.sigma[i]  = result[[1]][11]
est.gamma0.sd[i] = result[[2]][7]
est.gamma1.sd[i] = result[[2]][8]
est.gamma2.sd[i] = result[[2]][9]
est.delta.sd[i]  = result[[2]][10] 
est.sigma.sd[i]  = result[[2]][11]


beta0[i]  = result[[1]][1]
beta1[i]  = result[[1]][2]
beta2[i]  = result[[1]][3]
theta0[i] = result[[1]][4]
theta1[i] = result[[1]][5]
theta2[i] = result[[1]][6]
beta0.sd[i]  = result[[2]][1]
beta1.sd[i]  = result[[2]][2]
beta2.sd[i]  = result[[2]][3]
theta0.sd[i] = result[[2]][4]
theta1.sd[i] = result[[2]][5]
theta2.sd[i] = result[[2]][6]


  
# CHECK WITH THE GROUND TRUTH
print(
rbind(
  .         = c("beta0", "beta1", "beta2", 
                "theta0", "theta1", "theta2", 
                "gamma0", "gamma1", "gamma2", 
                "gamma.cm", "sigma"),  
  Estimate = round(result[[1]], d=3), 
  Std.Error = round(result[[2]], d=3),
  Truth = round(truepar, d=2))
)

}


pdf("FigureC5.pdf", width=7, height=8)
par(oma=c(3,0,0,0), mfrow=c(2*2,3), mar=c(2,2,4,2)) # b,l,t,r
cole = "black"

plot(Nvec, est.gamma0, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=truepar[7], lty=2, col="firebrick")
arrows(y0=est.gamma0-1.96*est.gamma0.sd, y1=est.gamma0+1.96*est.gamma0.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, est.gamma0, pch=16, cex=1.5)
title(expression(gamma[0]), line=1, cex.main=2)

plot(Nvec, est.gamma1, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=truepar[8], lty=2, col="firebrick")
arrows(y0=est.gamma1-1.96*est.gamma1.sd, y1=est.gamma1+1.96*est.gamma1.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, est.gamma1, pch=16, cex=1.5)
title(expression(gamma[1]), line=1, cex.main=2)

plot(Nvec, est.gamma2, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=truepar[9], lty=2, col="firebrick")
arrows(y0=est.gamma2-1.96*est.gamma2.sd, y1=est.gamma2+1.96*est.gamma2.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, est.gamma2, pch=16, cex=1.5)
title(expression(gamma[2]), line=1, cex.main=2)

plot(Nvec, est.delta, type="n", xlab="",ylab="",main="",ylim=c(-2,4), cex.main=1.5)
abline(h=truepar[10], lty=2, col="firebrick")
arrows(y0=est.delta-1.96*est.delta.sd, y1=est.delta+1.96*est.delta.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, est.delta, pch=16, cex=1.5)
title(expression(delta), line=1, cex.main=2)

plot(Nvec, est.sigma, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=truepar[11], lty=2, col="firebrick")
arrows(y0=est.sigma-1.96*est.sigma.sd, y1=est.sigma+1.96*est.sigma.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, est.sigma, pch=16, cex=1.5)
title(expression(sigma), line=1, cex.main=2)


plot(0, xaxt = 'n', yaxt = 'n', bty = 'n', pch = '', ylab = '', xlab = '')

## END GAMMA, START BETA

plot(Nvec, beta0, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=beta[1], lty=2, col="firebrick")
arrows(y0=beta0-1.96*beta0.sd, y1=beta0+1.96*beta0.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, beta0, pch=16, cex=1.5)
title(expression(beta[0]), line=1, cex.main=2)

plot(Nvec, beta1, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=beta[2], lty=2, col="firebrick")
arrows(y0=beta1-1.96*beta1.sd, y1=beta1+1.96*beta1.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, beta1, pch=16, cex=1.5)
title(expression(beta[1]), line=1, cex.main=2)


plot(Nvec, beta2, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=beta[3], lty=2, col="firebrick")
arrows(y0=beta2-1.96*beta2.sd, y1=beta2+1.96*beta2.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, beta2, pch=16, cex=1.5)
title(expression(beta[2]), line=1, cex.main=2)


plot(Nvec, theta0, type="n", xlab="",ylab="",main="",ylim=c(-2,4), cex.main=1.5)
abline(h=theta[1], lty=2, col="firebrick")
arrows(y0=theta0-1.96*theta0.sd, y1=theta0+1.96*theta0.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, theta0, pch=16, cex=1.5)
title(expression(theta[0]), line=1, cex.main=2)


plot(Nvec, theta1, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=theta[2], lty=2, col="firebrick")
arrows(y0=theta1-1.96*theta1.sd, y1=theta1+1.96*theta1.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, theta1, pch=16, cex=1.5)
title(expression(theta[1]), line=1, cex.main=2)


plot(Nvec, theta2, type="n", xlab="",ylab="",main="",ylim=c(-6,2), cex.main=1.5)
abline(h=theta[3], lty=2, col="firebrick")
arrows(y0=theta2-1.96*theta2.sd, y1=theta2+1.96*theta2.sd,
       x0=Nvec, x1=Nvec, length=0, angle=0, col=cole, lwd=2)
points(Nvec, theta2, pch=16, cex=1.5)
title(expression(theta[2]), line=1, cex.main=2)


mtext(text="Sample Size",side=1,line=1,outer=TRUE)

dev.off()

end <- Sys.time()
start - end

#############################################################################
# END OF THIS R SOURCE FILE
#############################################################################